/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

/* Version-control identification string for this file: it carries the
   "@(#)" marker followed by an expanded "$Id$" keyword (file name, revision,
   date, author).  It is not referenced anywhere else in this file. */
static const char __idstring[] = "@(#)$Id: mx__partner.c,v 1.48 2006/10/31 07:33:42 loic Exp $";

#include "mx_auto_config.h"
#include "myriexpress.h"
#include "mx__lib_types.h"
#include "mx__partner.h"
#include "mx_byteswap.h"
#include "mx__driver_interface.h"
#include "mx__shmem.h"
#include "mx__lib.h"
#include "mx__debug_dump.h"
#include "mx__requests.h"

/* TODO: Defer copy of event? */
/*
 * Queue a copy of a received event on PARTNER's early queue.
 *
 * The early queue is walked until an entry is found for which
 * mx__msg_order(entry->msg_seq, msg_seq) is positive; the walk position
 * (that entry, or wherever the walk ends when there is none) is handed to
 * mx__partner_insert_early_after() together with the new entry.
 *
 * sizeof (mcp_uevt_tiny_t) bytes of RECV are copied into the new entry.
 * When DATA is non-NULL the payload is duplicated as well; its length is
 * recv->length, or the medium event's frame_length when TYPE is
 * MX_MCP_UEVT_RECV_MEDIUM (both converted with ntohs).  When DATA is NULL
 * the entry's data pointer is NULL.
 *
 * Returns the queued entry, or NULL if either allocation failed, in which
 * case nothing is queued.
 */
struct mx__early *
mx__partner_insert_early(struct mx__partner *partner,
			 mcp_uevt_msg_t *recv,
			 uint16_t msg_seq,
			 mx__process_recv_msg_t recv_func,
			 uint8_t type, char *data)
{
  struct mx__early_queue_head * elt;
  struct mx__early *cursor;
  struct mx__early *entry;
  uint32_t payload_len;

  /* Find the insertion position: stop on the first queued entry whose
     sequence number compares greater than msg_seq. */
  MX__FOREACH_PARTNER_EARLY(cursor, elt, partner)
    if (mx__msg_order(cursor->msg_seq, msg_seq) > 0)
      break;

  entry = mx_malloc(sizeof (*entry));
  if (entry == NULL)
    return NULL;

  /* EVENT: Copy recv buffer. */
  mx_memcpy(&(entry->recv_tiny), recv, sizeof (mcp_uevt_tiny_t));

  /* Medium events carry the fragment size in their own frame_length field. */
  payload_len = ntohs(recv->length);
  if (type == MX_MCP_UEVT_RECV_MEDIUM) {
    mcp_uevt_medium_t *medium = (void*)recv;
    payload_len = ntohs(medium->frame_length);
  }

  if (data == NULL) {
    entry->data = NULL;
  } else {
    /* Keep a private copy of the payload. */
    entry->data = mx_malloc(payload_len);
    if (entry->data == NULL) {
      mx_free(entry);
      return NULL;
    }
    mx_memcpy(entry->data, data, payload_len);
  }

  entry->msg_seq = msg_seq;
  entry->recv_func = recv_func;
  entry->type = type;

  mx__partner_insert_early_after(partner, entry, cursor);
  return entry;
}

/*
 * Allocate and initialise a partner record for endpoint number ENDPT on the
 * peer identified by PEER_INDEX.
 *
 * The record is zero-allocated, its partial/early/pending queues are
 * initialised, the three session fields are set to -1, and the peer index
 * (network byte order) and endpoint id are stored.
 *
 * The NIC id is a fixed placeholder when mx__opt.no_myrinet is set;
 * otherwise it is looked up from PEER_INDEX through EP's driver handle, and
 * a failed lookup is reported through mx_printf() and mx_fatal().
 *
 * Returns the new partner.
 */
struct mx__partner *
mx__partner_create(struct mx_endpoint *ep, uint16_t peer_index, uint16_t endpt)
{
  struct mx__partner *p;
  mx_lookup_peer_t lookup;

  p = mx_calloc(1, sizeof (*p));
  mx_fixme_assert(p);

  mx__init_partner_request_queue(&p->partialq);
  mx__partner_init_early_queue(p);
  mx__init_partner_request_queue(&p->pendingq);

  p->endpoint_sid_n = -1;
  p->connect_session_n = -1;
  p->best_session_n = -1;
  p->peer_index_n = htons(peer_index);
  p->eid = (uint8_t)endpt;

  /* hack: if we have fw_ack, pretend there is always a lib2lib ack
     pending to prevent the lib from generating any real one */
  if (mx__opt.fw_ack)
    p->liback_pending = (void*)(uintptr_t)-1;
  else
    p->liback_pending = 0;

  /* Without Myrinet there is no peer table to query. */
  if (mx__opt.no_myrinet) {
    p->nic_id = UINT64_C(0x123456789abc);
    return p;
  }

  lookup.index = peer_index;
  lookup.board_number = -1; /* not sure why it exists at all */
  if (mx__peer_index_to_nic_id(ep->handle, &lookup) != MX_SUCCESS) {
    mx_printf("ERROR:mx__partner_create:Invalid peer: 0x%04x", peer_index);
    mx_fatal("Cannot continue");
  }
  p->nic_id = lookup.nic_id;

  return p;
}

/*
 * Abort every request in QUEUE that belongs to PARTNER.
 *
 * Each matching request is completed through mx__abort_sent_request() with
 * STATUS_CODE.  When mx__opt.verbose is set, REASON is printed followed by
 * a dump of the request.
 *
 * REASON is printed as plain text, never interpreted as a format string,
 * so it may safely contain '%' characters.
 *
 * Returns the number of requests aborted.
 */
static int
mx__abort_partner_requests_in_queue(struct mx_endpoint *ep, struct mx__request_queue_head * queue,
				    struct mx__partner *partner,
				    mx_status_code_t status_code, const char * reason)
{
  union mx_request * req;
  struct mx__request_queue_head * elt, * next;
  int count = 0;

  /* The _SAFE iterator is used because aborting a request may unlink it
     from the queue being walked. */
  MX__FOREACH_REQ_SAFE(req, elt, next, queue) {

    if (req->basic.partner != partner)
      continue;

    if (mx__opt.verbose) {
      mx_printf("%s", reason);
      mx__dump_request(ep, req);
    }

    /* complete with status error */
    mx__abort_sent_request(ep, partner, req, status_code);

    count++;
  }

  return count;
}

void
mx__partner_cleanup(struct mx_endpoint *ep, struct mx__partner *partner, int disconnect)
{
  union mx_request *req;
  uint32_t ctxid;
  int count;

  /*************
   * complete pending send/get with an error status
   * (they should get nacked earlier most of the times)
   */
  count = 0;

  {
    struct mx__partner_request_queue_head * elt, * next;
    MX__FOREACH_PARTNER_REQ_SAFE(req, elt, next, &partner->pendingq) {
      if (mx__opt.verbose) {
	mx_printf("Aborting pending send request 0x%x due to remote peer ", req->basic.send_seq);
	mx__print_partner(partner);
	mx_printf(" disconnected\n");
	mx__dump_request(ep, req);
      }

      /* complete with status error */
      mx__abort_sent_request(ep, partner, req, MX_STATUS_ENDPOINT_UNREACHABLE);
      count++;
    }
  }

  /* scan the queues in case we missed one in the pending array */
  count += mx__abort_partner_requests_in_queue(ep, &ep->send_reqq, partner,
					       MX_STATUS_ENDPOINT_UNREACHABLE,
					       "Aborting queued send request due to remote peer disconnected\n");
  count += mx__abort_partner_requests_in_queue(ep, &ep->resend_list, partner,
					       MX_STATUS_ENDPOINT_UNREACHABLE,
					       "Aborting pending send request due to remote peer disconnected\n");
  count += mx__abort_partner_requests_in_queue(ep, &ep->resend_reqq, partner,
					       MX_STATUS_ENDPOINT_UNREACHABLE,
					       "Aborting requeued send request due to remote peer disconnected\n");
  count += mx__abort_partner_requests_in_queue(ep, &ep->notifying_large_sendq, partner,
					       MX_STATUS_ENDPOINT_UNREACHABLE,
					       "Aborting notifying large send request due to remote peer disconnected\n");
  if (count) {
    mx_printf("Aborted %d send requests due to remote peer ", count);
    mx__print_partner(partner);
    mx_printf(" disconnected\n");
  }

  /***********
   * complete partially received request with an error status
   */
  count = 0;
  while (!mx__isempty_partner_request_queue(&partner->partialq)) {
    req = mx__first_partner_request(&partner->partialq);

    if (mx__opt.verbose) {
      mx_printf("Aborting partially received message due to remote peer disconnected\n");
      mx__dump_request(ep, req);
    }

    /* dequeue and complete with status error */
    mx__received_last_frag(ep, req,
			   1 /* there can't be no frag or the request wouldn't exist */,
			   MX_STATUS_ENDPOINT_UNREACHABLE);
    count++;
  }
  if (count) {
    mx_printf("Dropped %d partially received messages due to remote peer ", count);
    mx__print_partner(partner);
    mx_printf(" disconnected\n");
  }

  /*************
   * drop early fragments
   */
  count = 0;
  while (!mx__isempty_partner_early_queue(partner)) {
    mx__partner_drop_early(mx__partner_first_early(partner));
    count++;
  }
  if (count) {
    mx_printf("Dropped %d early received fragments due to remote peer ", count);
    mx__print_partner(partner);
    mx_printf(" disconnected\n");
  }

  /**************
   * drop unexpected from this peer
   */
  count = 0;
  for(ctxid=0; ctxid < ep->ctxid_max; ctxid++) {
    struct mx__request_queue_head * head = &ep->ctxid[ctxid].unexpq;
    struct mx__request_queue_head * elt, * next;

    MX__FOREACH_REQ_SAFE(req, elt, next, head) {

      if (req->basic.partner != partner)
	continue;

      if (mx__opt.verbose) {
	mx_printf("Drop unexpected message due to remote peer disconnected\n");
	mx__dump_request(ep, req);
      }

      /* drop it and that's it */
      mx__spliceout_request(head, req);
      mx__rl_free(ep, req);
      count++;
    }
  }
  if (count) {
    mx_printf("Dropped %d unexpected messages due to remote peer ", count);
    mx__print_partner(partner);
    mx_printf(" disconnected\n");
  }

  /*******************
   * drop pending pending acks
   */
  if (partner->ack_list.tqe_prev) {
    TAILQ_REMOVE(&ep->partners_to_ack, partner, ack_list);
    partner->ack_list.tqe_prev = NULL;
  }

  /* any pending ack was in resend_reqq, it must have been dropped above */
  mx_assert(!partner->liback_pending);

  /**************
   * change recv_seq to something very different for safety
   */
  if (disconnect)
    partner->recv_seq += MX__SESNO(partner->recv_seq + MX__SESNO_ONE)
		       | MX__SEQNO(partner->recv_seq + 1000);

  /**************
   * reset everything else to zero
   */
  partner->oldest_recv_time = 0;
  partner->last_ack = 0;
  partner->send_seq = 0;
  partner->fully_recv_seq = 0;
  memset(partner->quadrant_count, 0, sizeof(*partner->quadrant_count));
  partner->connect_recvseq = 0;
  partner->connect_sendseq = 0;
  partner->send_acked = 0;
  partner->recv_acked = 0;
  partner->recv_acknum = 0;
  partner->send_acknum = 0;
  partner->endpoint_sid_n = -1;
  partner->connect_session_n = -1;
  partner->best_session_n = -1;
}

/* How disconnection works
 **************************
 * In case a remote endpoint appears to be dead, we need to be able to cleanup
 * things before eventually reconnecting to a new instance of this endpoint.
 * Disconnection is done in mx__partner_cleanup():
 * + It traverses the pending queues of this peer to find all messages that have
 *   been sent once, and completes them with MX_STATUS_ENDPOINT_UNREACHABLE using
 *   mx__abort_sent_request() (see below).
 * + It traverses the various send queues to drop other send messages that have
 *   been either not sent yet, or sent and acked and waiting for an explicit
 *   reply (especially the large send waiting for notify) and completes them the
 *   same way.
 * + It drops medium messages that have been partially received before the peer
 *   died and completes the associated receive with MX_STATUS_ENDPOINT_UNREACHABLE.
 * + It drops all unexpected and early fragments that came from this peer
 *   without being received and completed.
 * + Finally it clears various things in the partner structure.
 *
 * We try to detect when we need to disconnect:
 * + when process_resend_list() reaches the maximal retransmit, it assumes the
 *   target is gone and stops resending this message. In case the receiver was only
 *   slow but not dead, we cannot let the other messages still go since the receiver
 *   will have a hole in its received seqnums. So we disconnect, which means all
 *   send/receive involving this peer complete with an error status or are dropped.
 * + if we receive a connect request from a peer that was already connected but with
 *   a different session id, we know that the endpoint on this peer is not the same
 *   one. So all pending send/receive from the previous instance of the peer will
 *   never complete. So we disconnect too.
 *
 * However, we cannot detect all cases where a node dies. For instance, if we send
 * a large request, it is acked, and the target dies during its GET. We are waiting
 * on the notify message, and have no possible timeout since we don't know when
 * matching will occur. In this case, the application might need to call
 * mx_disconnect() directly.
 *
 * mx__abort_sent_request() works by looking at the request type and its state
 * to guess in which queue it is, and thus how to complete it. Then it removes
 * it from the queue and marks it as acked if required (when a seqnum has been
 * allocated and no ack was received).
 * Then, either the request is now in a state as if it has been acked, and
 * mx__send_acked_and_mcp_complete() will be used to complete it and release
 * resources, or in some special cases mx__send/recv_complete is enough.
 *
 * Receiving a nack during a large GET is a special case. The MCP returns a UEVT_ERROR
 * which acts as both a NACK and a DONE event. But there is no seqnum associated with
 * a large get (only the notify gets a seqnum during a large recv). So we have to release
 * the resources (mcp_handle and rdma window) and complete the request, with no need to
 * mark the request as acked as usual. All this is done in mx__process_events instead of
 * mx__abort_sent_request() since it is very different from the common case.
 */
